Set the working dir
setwd("/mnt/picea/projects/aspseq/emellerowicz/Cazymes/Selected-Tissues")
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
source("~/Git/UPSCb/src/R/expressionSpecificityUtility.R")
source("~/Git/UPSCb/src/R/percentile.R")
hpal <- colorRampPalette(colors = c("blue","white","red"))(100)
dat <- read.delim2("~/Git/UPSCb/projects/aspen-cazymes/doc/Cazy-selected.txt")
sdat <- t(scale(t(as.matrix(dat[,4:26]))))
Remove non expressed cazymes
sel <- !rowSums(is.na(sdat)) == 23
hc.s <- hclust(d=pearson.dist(t(sdat[sel,])),method = "ward.D2")
plot(hc.s)
hc.g <- hclust(d=pearson.dist(sdat[sel,]),method = "ward.D2")
plot(hc.g,labels=dat$CAZY[sel])
heatmap.2(sdat[sel,],
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method = "ward.D2")},
labRow=dat$CAZY[sel],trace="none",
las=2,col=hpal,
key=TRUE,main="Tissue expression",
cexCol = 0.8,cexRow = 0.4,srtCol = 45,margins = c(10.1,4.1))
pdf("cazymes-all-tissues.pdf",width = 110,height=110)
heatmap.2(sdat[sel,],
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method = "ward.D2")},
labRow=dat$ID[sel],trace="none",
las=2,col=hpal,
key=FALSE,main="Tissue expression",
cexCol = 0.8,cexRow = 0.4,srtCol = 45,margins = c(10.1,4.1))
dev.off()
## png
## 2
Re-create the hierarchical clustering
f.hclust <- hclust(pearson.dist(sdat[sel,]),"ward.D2")
ct <- cutree(f.hclust,k=50)
check the cluster sizes
range(table(ct))
## [1] 4 56
Per cluster, print the heatmap and export the gene list
sapply(1:50,function(i){
sel3 <- ct==i
res <- heatmap.2(sdat[sel,][sel3,],
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method = "ward.D2")},
labRow=dat$ID[sel][sel3],trace="none",
las=2,col=hpal,
key=FALSE,main=sprintf("Tissue expression cluster %02d",i),
cexCol = 0.8,cexRow = 0.8,srtCol = 45,margins = c(12.1,8.1))
pdf(sprintf("cazymes-all-tissues-%02d.pdf",i),width = 12,height=8)
heatmap.2(sdat[sel,][sel3,],
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method = "ward.D2")},
labRow=dat$ID[sel][sel3],trace="none",
las=2,col=hpal,
key=FALSE,main=sprintf("Tissue expression cluster %02d",i),
cexCol = 0.8,cexRow = 0.8,srtCol = 45,margins = c(12.1,8.1))
dev.off()
write(rev(dat$ID[sel][sel3][res$rowInd]),file = sprintf("cazymes-all-tissues-cluster%02d.txt",i))
})
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## NULL
##
## [[14]]
## NULL
##
## [[15]]
## NULL
##
## [[16]]
## NULL
##
## [[17]]
## NULL
##
## [[18]]
## NULL
##
## [[19]]
## NULL
##
## [[20]]
## NULL
##
## [[21]]
## NULL
##
## [[22]]
## NULL
##
## [[23]]
## NULL
##
## [[24]]
## NULL
##
## [[25]]
## NULL
##
## [[26]]
## NULL
##
## [[27]]
## NULL
##
## [[28]]
## NULL
##
## [[29]]
## NULL
##
## [[30]]
## NULL
##
## [[31]]
## NULL
##
## [[32]]
## NULL
##
## [[33]]
## NULL
##
## [[34]]
## NULL
##
## [[35]]
## NULL
##
## [[36]]
## NULL
##
## [[37]]
## NULL
##
## [[38]]
## NULL
##
## [[39]]
## NULL
##
## [[40]]
## NULL
##
## [[41]]
## NULL
##
## [[42]]
## NULL
##
## [[43]]
## NULL
##
## [[44]]
## NULL
##
## [[45]]
## NULL
##
## [[46]]
## NULL
##
## [[47]]
## NULL
##
## [[48]]
## NULL
##
## [[49]]
## NULL
##
## [[50]]
## NULL
samples.cluster <- cutree(hc.s,k=5)
sort(samples.cluster)
## Buds_Dormant_SLU.May Flowers_Dormant_SLU
## 1 1
## Flowers_Expanding_SLU Leaves_Freshly_Expanded_SLU
## 1 1
## Leaves_Young_Expanding_SLU Suckers_Whole_Sucker_SLU
## 1 1
## Buds_Pre_chilling_SLU_July30 Petiole_Mature_SLU
## 2 2
## Roots_Control_Greenhouse Roots_Drought_Greenhouse
## 2 2
## Twigs_Non_Girdled_SLU Flowers_Expanded_SLU
## 2 3
## Seeds_Mature_SLU Leaves_Beetle_Damaged_Lab
## 3 4
## Leaves_Control_Greenhouse Leaves_Drought_Greenhouse
## 4 4
## Leaves_Mature_2_SLU.Aug3 Leaves_Mature_SLU.July30
## 4 4
## Leaves_Mechanical_Damage_Greenhouse Leaves_Undamaged_Greenhouse
## 4 4
## Cambium Phloem
## 5 5
## Xylem
## 5
es <- expressionSpecificity(as.matrix(dat[,4:26]),
tissues = as.character(factor(samples.cluster)),
mode = "local",output = "complete")
## Calculating tissue specificity
exp.peak <- as.character(1:5)[sapply(apply(es[,as.character(1:5)] == es[,"maxn"],1,which),"[",1)]
ord <- order(exp.peak,(1-es[,"score"]))
s.ord <- order(samples.cluster)
# heatmap.2(sdat[ord,s.ord],
# #distfun = pearson.dist,
# #hclustfun = function(X){hclust(X,method = "ward.D2")},
# dendrogram = "none",
# Colv = FALSE, Rowv = FALSE,
# labRow=dat$CAZY[ord],trace="none",
# colRow = names(samples.cluster)[s.ord],
# las=2,col=hpal,
# key=FALSE,main="Tissue expression",
# cexCol = 0.8,cexRow = 0.4,srtCol = 45,margins = c(10.1,4.1))
tissues <- names(samples.cluster)[c(4,14,11,23,19,21)]
Remove cazymes with insufficient variance
percentile(rowSds(sdat[sel,match(tissues,colnames(sdat))]))
## 0% 1% 2% 3% 4%
## 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 3.040471e-17
## 5% 6% 7% 8% 9%
## 3.040471e-17 6.080942e-17 2.407618e-01 3.790760e-01 4.412825e-01
## 10% 11% 12% 13% 14%
## 4.825874e-01 5.224530e-01 5.517175e-01 5.683728e-01 5.891265e-01
## 15% 16% 17% 18% 19%
## 6.271915e-01 6.518213e-01 6.707525e-01 6.958483e-01 7.093240e-01
## 20% 21% 22% 23% 24%
## 7.235366e-01 7.308492e-01 7.480326e-01 7.592478e-01 7.775784e-01
## 25% 26% 27% 28% 29%
## 7.846098e-01 7.990258e-01 8.073055e-01 8.178325e-01 8.282935e-01
## 30% 31% 32% 33% 34%
## 8.445277e-01 8.531262e-01 8.604260e-01 8.733740e-01 8.790009e-01
## 35% 36% 37% 38% 39%
## 8.866508e-01 8.950814e-01 9.015552e-01 9.098341e-01 9.158462e-01
## 40% 41% 42% 43% 44%
## 9.264510e-01 9.374317e-01 9.456656e-01 9.494436e-01 9.529627e-01
## 45% 46% 47% 48% 49%
## 9.601532e-01 9.680697e-01 9.759768e-01 9.832499e-01 9.913180e-01
## 50% 51% 52% 53% 54%
## 1.001037e+00 1.005045e+00 1.009298e+00 1.017451e+00 1.028613e+00
## 55% 56% 57% 58% 59%
## 1.034414e+00 1.044330e+00 1.054655e+00 1.063295e+00 1.068333e+00
## 60% 61% 62% 63% 64%
## 1.076516e+00 1.079805e+00 1.085405e+00 1.098573e+00 1.108120e+00
## 65% 66% 67% 68% 69%
## 1.116825e+00 1.127411e+00 1.134946e+00 1.138400e+00 1.143678e+00
## 70% 71% 72% 73% 74%
## 1.147028e+00 1.153913e+00 1.163572e+00 1.170221e+00 1.178502e+00
## 75% 76% 77% 78% 79%
## 1.186533e+00 1.194572e+00 1.204599e+00 1.209675e+00 1.223638e+00
## 80% 81% 82% 83% 84%
## 1.237802e+00 1.245941e+00 1.259554e+00 1.272969e+00 1.288225e+00
## 85% 86% 87% 88% 89%
## 1.304631e+00 1.322101e+00 1.334147e+00 1.344490e+00 1.363526e+00
## 90% 91% 92% 93% 94%
## 1.388256e+00 1.410480e+00 1.440648e+00 1.467026e+00 1.491709e+00
## 95% 96% 97% 98% 99%
## 1.516316e+00 1.530270e+00 1.564936e+00 1.629950e+00 1.721797e+00
## 100%
## 1.957890e+00
sel2 <- rowSds(sdat[sel,match(tissues,colnames(sdat))]) >= 0.05
Plot for the selected tissues and remaining cazymes
heatmap.2(sdat[sel,match(tissues,colnames(sdat))][sel2,],
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method = "ward.D2")},
labRow=dat$CAZY[sel][sel2],trace="none",
las=2,col=hpal,
key=FALSE,main="Tissue expression",
cexCol = 0.8,cexRow = 0.4,srtCol = 45,margins = c(10.1,4.1))
pdf("cazymes-selected-tissues-all.pdf",width = 110,height=110)
heatmap.2(sdat[sel,match(tissues,colnames(sdat))][sel2,],
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method = "ward.D2")},
labRow=dat$ID[sel][sel2],trace="none",
las=2,col=hpal,
key=FALSE,main="Tissue expression",
cexCol = 0.8,cexRow = 0.4,srtCol = 45,margins = c(10.1,4.1))
dev.off()
## png
## 2
Re-create the hierarchical clustering
f.hclust <- hclust(pearson.dist(sdat[sel,match(tissues,colnames(sdat))][sel2,]),"ward.D2")
ct <- cutree(f.hclust,k=40)
check the cluster sizes
range(table(ct))
## [1] 4 46
Per cluster, print the heatmap and export the gene list
sapply(1:40,function(i){
sel3 <- ct==i
res <- heatmap.2(sdat[sel,match(tissues,colnames(sdat))][sel2,][sel3,],
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method = "ward.D2")},
labRow=dat$ID[sel][sel2][sel3],trace="none",
las=2,col=hpal,
key=FALSE,main=sprintf("Tissue expression cluster %02d",i),
cexCol = 0.8,cexRow = 0.8,srtCol = 45,margins = c(12.1,8.1))
pdf(sprintf("cazymes-selected-tissues-%02d.pdf",i),width = 12,height=8)
heatmap.2(sdat[sel,match(tissues,colnames(sdat))][sel2,][sel3,],
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method = "ward.D2")},
labRow=dat$ID[sel][sel2][sel3],trace="none",
las=2,col=hpal,
key=FALSE,main=sprintf("Tissue expression cluster %02d",i),
cexCol = 0.8,cexRow = 0.8,srtCol = 45,margins = c(12.1,8.1))
dev.off()
write(rev(dat$ID[sel][sel2][sel3][res$rowInd]),file = sprintf("cluster%02d.txt",i))
})
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## NULL
##
## [[14]]
## NULL
##
## [[15]]
## NULL
##
## [[16]]
## NULL
##
## [[17]]
## NULL
##
## [[18]]
## NULL
##
## [[19]]
## NULL
##
## [[20]]
## NULL
##
## [[21]]
## NULL
##
## [[22]]
## NULL
##
## [[23]]
## NULL
##
## [[24]]
## NULL
##
## [[25]]
## NULL
##
## [[26]]
## NULL
##
## [[27]]
## NULL
##
## [[28]]
## NULL
##
## [[29]]
## NULL
##
## [[30]]
## NULL
##
## [[31]]
## NULL
##
## [[32]]
## NULL
##
## [[33]]
## NULL
##
## [[34]]
## NULL
##
## [[35]]
## NULL
##
## [[36]]
## NULL
##
## [[37]]
## NULL
##
## [[38]]
## NULL
##
## [[39]]
## NULL
##
## [[40]]
## NULL
Session Info
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] matrixStats_0.53.1 hyperSpec_0.99-20171005 ggplot2_2.2.1
## [4] lattice_0.20-35 gplots_3.0.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.16 knitr_1.20 magrittr_1.5
## [4] munsell_0.4.3 colorspace_1.3-2 R6_2.2.2
## [7] rlang_0.2.0 plyr_1.8.4 stringr_1.3.0
## [10] caTools_1.17.1 tools_3.4.3 gtable_0.2.0
## [13] KernSmooth_2.23-15 latticeExtra_0.6-28 htmltools_0.3.6
## [16] gtools_3.5.0 lazyeval_0.2.1 yaml_2.1.19
## [19] rprojroot_1.3-2 digest_0.6.15 tibble_1.4.2
## [22] RColorBrewer_1.1-2 bitops_1.0-6 testthat_2.0.0
## [25] evaluate_0.10.1 rmarkdown_1.9 gdata_2.18.0
## [28] stringi_1.2.2 pillar_1.2.2 compiler_3.4.3
## [31] scales_0.5.0 backports_1.1.2